{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "lMZC2Cxd79L0",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "7ea27ba9-72f8-426b-db12-80339e476126"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ],
      "source": [
        "# # Mounting drive for running the code through Google colab \n",
        "# from google.colab import drive\n",
        "# drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3Xcicu_phAXv"
      },
      "source": [
        "**The Python version we use: 3.7.13**"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1divkIWYwSR2"
      },
      "source": [
        "# 1- Tokenizer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5duRggBRZKvP"
      },
      "outputs": [],
      "source": [
        "# We won't need TensorFlow here\n",
        "!pip uninstall -y tensorflow\n",
        "\n",
        "# Install `transformers` from master\n",
        "# Install huggingface-hub-0.8.1 pyyaml-6.0 tokenizers-0.12.1 transformers-4.22.0.dev0\n",
        "!pip install git+https://github.com/huggingface/transformers\n",
        "!pip list | grep -E 'transformers|tokenizers'"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from tokenizers import SentencePieceBPETokenizer\n",
        "\n",
        "tokenizer = SentencePieceBPETokenizer()"
      ],
      "metadata": {
        "id": "xG-80mengvxG"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "paths = '/ecg_train.txt' ###path to the train_dataset\n",
        "special_tokens = [\"<s>\", \"<pad>\", \"</s>\", \"<unk>\", \"<cls>\", \"<sep>\", \"<mask>\"]"
      ],
      "metadata": {
        "id": "Kl_AJAAI4V11"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "tokenizer.train(\n",
        "    paths,\n",
        "    vocab_size=52_000,\n",
        "    min_frequency=2, #The minimum frequency a pair should have in order to be merged.\n",
        "    show_progress=True,\n",
        "    limit_alphabet=100, #The maximum different characters to keep in the alphabet.\n",
        "    special_tokens=special_tokens\n",
        ")\n",
        "print(\"voc size\", tokenizer.get_vocab_size())"
      ],
      "metadata": {
        "id": "Hum6Cw3og_Jz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QqgEAf33uoFH"
      },
      "source": [
        "# Making Our Tokenizer "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8c8ZoiKaoddw"
      },
      "outputs": [],
      "source": [
        "address = \"./HeartBert\" #Replace your local address here"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import transformers\n",
        "tokenizer = transformers.PreTrainedTokenizerFast(tokenizer_object=tokenizer, special_tokens=special_tokens) "
      ],
      "metadata": {
        "id": "PGeqj391UvDZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "tokenizer.bos_token = \"<s>\"\n",
        "tokenizer.bos_token_id = tokenizer.convert_tokens_to_ids(\"<s>\")\n",
        "tokenizer.pad_token = \"<pad>\"\n",
        "tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(\"<pad>\")\n",
        "tokenizer.eos_token = \"</s>\"\n",
        "tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids(\"</s>\")\n",
        "tokenizer.unk_token = \"<unk>\"\n",
        "tokenizer.unk_token_id = tokenizer.convert_tokens_to_ids(\"<unk>\")\n",
        "tokenizer.cls_token = \"<cls>\"\n",
        "tokenizer.cls_token_id = tokenizer.convert_tokens_to_ids(\"<cls>\")\n",
        "tokenizer.sep_token = \"<sep>\"\n",
        "tokenizer.sep_token_id = tokenizer.convert_tokens_to_ids(\"<sep>\")\n",
        "tokenizer.mask_token = \"<mask>\"\n",
        "tokenizer.mask_token_id = tokenizer.convert_tokens_to_ids(\"<mask>\")"
      ],
      "metadata": {
        "id": "VNl7xv6tWM6h"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#!mkdir HeartBert\n",
        "tokenizer.save_pretrained(address)"
      ],
      "metadata": {
        "id": "axnbFx0JXPOi"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pNDLrd0Kwsu7"
      },
      "source": [
        "# 2- Heart MLM Model "
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2YdPEFn9w50q"
      },
      "source": [
        "# HeartBert---training phase"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kD140sFjh0LQ"
      },
      "outputs": [],
      "source": [
        "# # Check that we have a GPU (for colab)\n",
        "# !nvidia-smi"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VNZZs-r6iKAV"
      },
      "outputs": [],
      "source": [
        "# # Check that PyTorch sees it\n",
        "import torch\n",
        "torch.cuda.is_available()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qxkrbo8yzUMr"
      },
      "source": [
        "# 2-1-  Model Config \n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "LTXXutqeDzPi"
      },
      "outputs": [],
      "source": [
        "from transformers import RobertaConfig\n",
        "\n",
        "config = RobertaConfig(\n",
        "    vocab_size=52_000,\n",
        "    max_position_embeddings=514,\n",
        "    num_attention_heads=12,\n",
        "    num_hidden_layers=6,\n",
        "    type_vocab_size=1,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JP9pG6_w1aq-"
      },
      "source": [
        "# 2-2- Tokenizer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "4keFBUjQFOD1"
      },
      "outputs": [],
      "source": [
        "from transformers import AutoTokenizer\n",
        "\n",
        "tokenizer = AutoTokenizer.from_pretrained(address) "
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "mHXMRACL1wI-"
      },
      "source": [
        "# 2-3- MLM definition "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "BzMqR-dzF4Ro"
      },
      "outputs": [],
      "source": [
        "from transformers import RobertaForMaskedLM\n",
        "\n",
        "model = RobertaForMaskedLM(config=config)\n",
        "print(model.num_parameters())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "URwYg4gP2ZZ6"
      },
      "source": [
        "# 2-4- Dataset Generation"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from torch.utils.data import Dataset\n",
        "import os\n",
        "from typing import Dict\n",
        "import torch\n",
        "\n",
        "class LineByLineTextDataset(Dataset):\n",
        "    # tokenizer: PreTrainedTokenizer,\n",
        "    def __init__(self, file_path: str, block_size: int):\n",
        "        if os.path.isfile(file_path) is False:\n",
        "            raise ValueError(f\"Input file path {file_path} not found\")\n",
        "        \n",
        "        lines= []\n",
        "        with open(file_path, encoding=\"utf-8\") as f:\n",
        "            # lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]\n",
        "            for line in f:\n",
        "              line= line.strip('\\n')\n",
        "              if len(line)!=0:\n",
        "                if line.isspace():\n",
        "                  print('this line contains space!')\n",
        "                lines.append(line)\n",
        "\n",
        "        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)\n",
        "        self.examples = batch_encoding[\"input_ids\"]\n",
        "        self.examples = [{\"input_ids\": torch.tensor(e, dtype=torch.long)} for e in self.examples]\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.examples)\n",
        "\n",
        "    def __getitem__(self, i) -> Dict[str, torch.tensor]:\n",
        "        return self.examples[i]\n"
      ],
      "metadata": {
        "id": "hgs_meAf19pA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GlvP_A-THEEl"
      },
      "outputs": [],
      "source": [
        "%%time\n",
        "########## Replace your file_path to train, val, and test files\n",
        "bs = 128\n",
        "\n",
        "train_dataset = LineByLineTextDataset(\n",
        "    # tokenizer=tokenizer,\n",
        "    file_path= '/ecg_train.txt',\n",
        "    block_size=bs,\n",
        ")\n",
        "\n",
        "eval_dataset = LineByLineTextDataset(\n",
        "    # tokenizer=tokenizer,\n",
        "    file_path='/ecg_val.txt',\n",
        "    block_size=bs,\n",
        ")\n",
        "\n",
        "test_dataset = LineByLineTextDataset(\n",
        "    # tokenizer=tokenizer,\n",
        "    file_path='/ecg_test.txt',\n",
        "    block_size=bs,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zTgWPa9Dipk2"
      },
      "outputs": [],
      "source": [
        "from transformers import DataCollatorForLanguageModeling\n",
        "\n",
        "data_collator = DataCollatorForLanguageModeling(\n",
        "    tokenizer=tokenizer, mlm=True, mlm_probability=0.15\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "E9qroc8a3TMe"
      },
      "source": [
        "# 2-5- Train"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "######################################################################################\n",
        "################################### ATTENTION!########################################\n",
        "### If this is the first time you run, run this cell; otherwise, skip this cell. ###\n",
        "######################################################################################\n",
        "\n",
        "import pickle\n",
        "\n",
        "path_loss_eval = '/loss_eval.pickle' #path to loss_eval.pickle\n",
        "path_loss_train = '/loss_train.pickle' #path to loss_train.pickle\n",
        "\n",
        "loss_eval = []\n",
        "loss_train = []\n",
        "\n",
        "with open(path_loss_train, 'wb') as handle:\n",
        "  pickle.dump(loss_train, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
        "\n",
        "with open(path_loss_eval, 'wb') as handle:\n",
        "  pickle.dump(loss_eval, handle, protocol=pickle.HIGHEST_PROTOCOL)"
      ],
      "metadata": {
        "id": "3F3Up_AETv0P"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import TrainerCallback\n",
        "import pickle\n",
        "\n",
        "path_loss_eval = '/loss_eval.pickle' #path to loss_eval.pickle\n",
        "path_loss_train = '/loss_train.pickle' #path to loss_train.pickle\n",
        "\n",
        "\n",
        "with open(path_loss_train, 'rb') as handle:\n",
        "  loss_train = pickle.load(handle)\n",
        "\n",
        "with open(path_loss_eval, 'rb') as handle:\n",
        "  loss_eval = pickle.load(handle)\n",
        "\n",
        "# the server can be lost in the middle of saving losses so we use the minimum length of stored losses between train and eval\n",
        "if len(loss_train)!=len(loss_eval):\n",
        "  print('difference between length of train loss and eval loss :(')\n",
        "  loss_train = loss_train[0:min(len(loss_train),len(loss_eval))]\n",
        "  loss_eval = loss_eval[0:min(len(loss_train),len(loss_eval))]\n",
        "\n",
        "class PrinterCallback(TrainerCallback):\n",
        "  def on_log(self, args, state, control, logs=None, **kwargs):\n",
        "      print('my logs:',logs)\n",
        "      if 'loss' in logs.keys():\n",
        "        loss_train.append(logs['loss'])\n",
        "        with open(path_loss_train, 'wb') as handle:\n",
        "          pickle.dump(loss_train, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
        "      elif 'eval_loss' in logs.keys():\n",
        "        loss_eval.append(logs['eval_loss'])\n",
        "        with open(path_loss_eval, 'wb') as handle:\n",
        "          pickle.dump(loss_eval, handle, protocol=pickle.HIGHEST_PROTOCOL)\n"
      ],
      "metadata": {
        "id": "0sFPNLqIUq2m"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YpvnFFmZJD-N"
      },
      "outputs": [],
      "source": [
        "from transformers import Trainer, TrainingArguments\n",
        "from transformers import EarlyStoppingCallback, IntervalStrategy\n",
        "\n",
        "training_args = TrainingArguments(\n",
        "    output_dir= address, #your address to the output directory\n",
        "    overwrite_output_dir=True,\n",
        "    num_train_epochs=1000,\n",
        "    per_gpu_train_batch_size=64,\n",
        "    per_gpu_eval_batch_size=64,\n",
        "    # gradient_accumulation_steps = 1,\n",
        "    save_steps=10_000,\n",
        "    save_total_limit=2,\n",
        "    prediction_loss_only=True,\n",
        "    #logging_steps=50,\n",
        "    logging_strategy = 'epoch',                            \n",
        "    evaluation_strategy = IntervalStrategy.EPOCH, \n",
        "    save_strategy = 'epoch',\n",
        "    metric_for_best_model = 'loss',\n",
        "    load_best_model_at_end=True,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "trainer = Trainer(\n",
        "    model=model.to('cuda'),\n",
        "    args=training_args,\n",
        "    data_collator=data_collator,\n",
        "    train_dataset=train_dataset,\n",
        "    eval_dataset=eval_dataset,\n",
        "    callbacks = [PrinterCallback, EarlyStoppingCallback(early_stopping_patience=50, early_stopping_threshold=0)] \n",
        ")"
      ],
      "metadata": {
        "id": "45BbBsIsstJf"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VmaHZXzmkNtJ"
      },
      "outputs": [],
      "source": [
        "######################################################################################\n",
        "################################### ATTENTION!########################################\n",
        "### If this is the first time you run, run this cell (skip the next); otherwise, skip this cell (run the next). ###\n",
        "######################################################################################\n",
        "\n",
        "%%time\n",
        "\n",
        "trainer.train() # only first time running "
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%time\n",
        "trainer.trainer(resume_from_checkpoint=True) "
      ],
      "metadata": {
        "id": "sOCYcbQPX-8D"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "J6gvCWdDiz-S"
      },
      "source": [
        "# 2-6- Evaluation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0eEF7m8YpuTo"
      },
      "outputs": [],
      "source": [
        "import matplotlib.pyplot as plt\n",
        "import numpy as np"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RPprr3GCIveY"
      },
      "outputs": [],
      "source": [
        "plt.plot(np.arange(1,len(loss_train)+1),loss_train,)\n",
        "plt.plot(np.arange(1,len(loss_eval)+1),loss_eval,)\n",
        "plt.legend(['loss_train','loss_eval'])\n",
        "plt.xlabel('epoch')\n",
        "plt.ylabel('loss')\n",
        "plt.title('loss per epoch')\n",
        "# plt.show()\n",
        "plt.savefig('loss_per_epoch.png')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "33T6qtDsIzOe"
      },
      "outputs": [],
      "source": [
        "class EvaluationMetrics:\n",
        "  def __init__(self,):\n",
        "    pass\n",
        "    #crossentropy is the loss of model wrt input\n",
        "  def perplexity(self, crossentropy):\n",
        "    return np.exp(crossentropy)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "97BvUKCVI2i_"
      },
      "outputs": [],
      "source": [
        "# an instance of class EvaluationMetrics\n",
        "evaluation_metrics = EvaluationMetrics()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dwZ6LIwuI5rr"
      },
      "outputs": [],
      "source": [
        "plt.plot(np.arange(1,len(loss_train)+1),evaluation_metrics.perplexity(loss_train))\n",
        "plt.plot(np.arange(1,len(loss_eval)+1),evaluation_metrics.perplexity(loss_eval))\n",
        "plt.legend(['ppl_train','ppl_eval'])\n",
        "plt.xlabel('epoch')\n",
        "plt.ylabel('ppl')\n",
        "plt.title('ppl per epoch')\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IKq7yk1qJKhO"
      },
      "source": [
        "test"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0B2fsM-XJKGr"
      },
      "outputs": [],
      "source": [
        "# copy loss_eval because when we call trainer.evaluate() on_log append loss of test to list of loss_eval so we have loss of eval loss on first part of the list and loss of test at the end part of the list.\n",
        "# to not confront to this problem first we clear the content of loss_eval and after we use it for test set, we bring back the original value of loss_eval.\n",
        "import copy\n",
        "loss_eval_temp = copy.deepcopy(loss_eval)\n",
        "loss_eval = []\n",
        "\n",
        "loss_test = []\n",
        "for i in range(len(test_dataset)): #check\n",
        "  # print(sample)\n",
        "  o = trainer.evaluate(eval_dataset=test_dataset[i:i+1],)\n",
        "  print(o['eval_loss'])\n",
        "  loss_test.append(o['eval_loss'])\n",
        "\n",
        "loss_eval = copy.deepcopy(loss_eval_temp)\n",
        "\n",
        "plt.plot(np.arange(1,len(loss_test)+1),loss_test)\n",
        "plt.legend(['loss_test'])\n",
        "plt.xlabel('epoch')\n",
        "plt.ylabel('loss')\n",
        "plt.title('loss per epoch')\n",
        "# plt.show()\n",
        "plt.savefig('test_loss_per_epoch.png')\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xTEVI4OgJPhX"
      },
      "outputs": [],
      "source": [
        "plt.plot(np.arange(1,len(loss_test)+1),evaluation_metrics.perplexity(loss_test))\n",
        "plt.legend(['ppl_test'])\n",
        "plt.xlabel('sample')\n",
        "plt.ylabel('ppl')\n",
        "plt.title('ppl per sample')\n",
        "# plt.show()\n",
        "plt.savefig('ppl_per_sample.png')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Qu7AObFYJSyH"
      },
      "outputs": [],
      "source": [
        "loss_eval_temp = copy.deepcopy(loss_eval)\n",
        "loss_eval = []\n",
        "# calculating perplexity of model using  test set. in eval_dataset you can use train, dev or test set\n",
        "print('PPL:',evaluation_metrics.perplexity(trainer.evaluate(eval_dataset=test_dataset,)['eval_loss']))\n",
        "\n",
        "loss_eval = copy.deepcopy(loss_eval_temp)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "h2FsdMVJ4Qjb"
      },
      "source": [
        "# 2-7 Saving model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "QDNgPls7_l13"
      },
      "outputs": [],
      "source": [
        "trainer.save_model(aaddress)\n",
        "\n",
        "# Saving model checkpoint to ./HeartBert/mlm_model\n",
        "# Configuration saved in ./HeartBert/mlm_model/config.json\n",
        "# Model weights saved in ./HeartBert/mlm_model/pytorch_model.bin"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "g5mhwS9L5QQT"
      },
      "source": [
        "# 3- Test\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iIQJ8ND_AEhl"
      },
      "source": [
        "Aside from looking at the training and eval losses going down, the easiest way to check whether our language model is learning anything interesting is via the `FillMaskPipeline`.\n",
        "\n",
        "Pipelines are simple wrappers around tokenizers and models, and the 'fill-mask' one will let you input a sequence containing a masked token (here, `<mask>`) and return a list of the most probable filled sequences, with their probabilities.\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ltXgXyCbAJLY"
      },
      "outputs": [],
      "source": [
        "# from transformers import pipeline\n",
        "\n",
        "# fill_mask = pipeline(\n",
        "#     \"fill-mask\",\n",
        "#     model= address,\n",
        "#     tokenizer= address\n",
        "# )"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "provenance": [],
      "toc_visible": true
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}